. Titanic dataset contains 891 rows and 12 columns.

. DataSet is downloaded from kaggle

. Notebook contains the indepth analysis of dataset including Data Cleaning, data wrangling, Data Visualization, Probability, Descriptive and Inferential Statistics.

Importing Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
library(tidyr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite

Loading and viewing the structure of data

titanic <- read.csv("C:/Users/Hp/Desktop/titanic.csv",sep = ",",stringsAsFactors = TRUE,na.strings = NA,header = T)

str(titanic)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
dim(titanic)
## [1] 891  12
summary(titanic)
##   PassengerId       Survived          Pclass     
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :446.0   Median :0.0000   Median :3.000  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309  
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000  
##                                                  
##                                     Name         Sex           Age       
##  Abbing, Mr. Anthony                  :  1   female:314   Min.   : 0.42  
##  Abbott, Mr. Rossmore Edward          :  1   male  :577   1st Qu.:20.12  
##  Abbott, Mrs. Stanton (Rosa Hunt)     :  1                Median :28.00  
##  Abelson, Mr. Samuel                  :  1                Mean   :29.70  
##  Abelson, Mrs. Samuel (Hannah Wizosky):  1                3rd Qu.:38.00  
##  Adahl, Mr. Mauritz Nils Martin       :  1                Max.   :80.00  
##  (Other)                              :885                NA's   :177    
##      SibSp           Parch             Ticket         Fare       
##  Min.   :0.000   Min.   :0.0000   1601    :  7   Min.   :  0.00  
##  1st Qu.:0.000   1st Qu.:0.0000   347082  :  7   1st Qu.:  7.91  
##  Median :0.000   Median :0.0000   CA. 2343:  7   Median : 14.45  
##  Mean   :0.523   Mean   :0.3816   3101295 :  6   Mean   : 32.20  
##  3rd Qu.:1.000   3rd Qu.:0.0000   347088  :  6   3rd Qu.: 31.00  
##  Max.   :8.000   Max.   :6.0000   CA 2144 :  6   Max.   :512.33  
##                                   (Other) :852                   
##          Cabin     Embarked
##             :687    :  2   
##  B96 B98    :  4   C:168   
##  C23 C25 C27:  4   Q: 77   
##  G6         :  4   S:644   
##  C22 C26    :  3           
##  D          :  3           
##  (Other)    :186

Cleaning and changing the schema of data

# Removing the NULL values 
titanic<- na.omit(titanic)
dim(titanic)
## [1] 714  12
#We can use sapply function to get the nº of missing values in our dataset
sapply(titanic,function(x) sum(is.na(x)))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0
# Converting to Factors

titanic$Pclass <- as.factor(titanic$Pclass)
titanic$Survived <- as.factor(titanic$Survived)
titanic$Sex <- as.factor(titanic$Sex)
titanic$Embarked <- as.factor(titanic$Embarked)

titanic$Survived<-ifelse(titanic$Survived==0,"Not Survivied","Survived")

titanic$Embarked<- ifelse(titanic$Embarked=="C","Cherbourg",(ifelse(titanic$Embarked=="Q","Queenstown","vSouthampton")))


Q1) What was the survival rate by gender?

###Using DPLYR
Survival_Count<-titanic %>%
  select(Sex,Survived) %>%
  group_by(Sex,Survived) %>%
  summarise(total = n())

Survival_Count
ABCDEFGHIJ0123456789
Sex
<fctr>
Survived
<chr>
total
<int>
femaleNot Survivied64
femaleSurvived197
maleNot Survivied360
maleSurvived93
### Using GGPLOT2
g2<- ggplot(titanic, aes(x = Sex, fill = Survived)) + 
  theme_economist() +
  geom_bar() +
  labs(y = "Passenger Count",
       title = "Titanic Survival Rates by Sex") +
  scale_y_continuous(limits = c(0,500),breaks = seq(0,500,100))

ggplotly(g2)

femalemale0100200300400500
Not SurviviedSurvived Titanic Survival Rates by Sex SexPassenger CountSurvived

=> By seeing the graph of survival rate of Gender, We can conclude that Survival count of female is more than male.


Q2) What was the survival rate by class of ticket?

### Using DPLYR
Survival_Count_By_Class <- titanic %>%
  select(Pclass,Survived) %>%
  group_by(Pclass,Survived) %>%
  summarise(Total= n())

Survival_Count_By_Class
ABCDEFGHIJ0123456789
Pclass
<fctr>
Survived
<chr>
Total
<int>
1Not Survivied64
1Survived122
2Not Survivied90
2Survived83
3Not Survivied270
3Survived85
### Using GGPLOT2
g3<-ggplot(titanic, aes(x = Pclass, fill = Survived)) + 
  theme_solarized() +
  geom_bar() +
  labs(y = "Passenger Count",
       title = "Titanic Survival Rates by Pclass") +
  scale_y_continuous(limits = c(0,400))

ggplotly(g3)

1230100200300400
Not SurviviedSurvivedTitanic Survival Rates by PclassPclassPassenger CountSurvived

=> By seeing the graph of survival rate by PClass, We can conclude that Survival count of people in Pclass 1 is more than others.


Q3) What was the survival rate?

g1<-ggplot(titanic, aes(x = Survived)) + 
  theme_wsj()+
  geom_bar(fill=rainbow(2),col="black") +
  labs(y = "Passenger Count",
       title = "Titanic Survival Rates")
ggplotly(g1)

Not SurviviedSurvived0100200300400
Titanic Survival Rates

=> By seeing the graph of survival rate, We can conclude that 285 people Survived and 418 were not Survived .


Q4) What is the distribution of passenger ages?

### using DPLYR
age_distribution <- titanic %>%
  select(Age) %>%
  mutate(distribution = cut(Age, breaks = seq(0,80,20)))%>%
  group_by(distribution) %>%
  summarise(total_count= n())

age_distribution <- na.omit(age_distribution)
age_distribution
ABCDEFGHIJ0123456789
distribution
<fctr>
total_count
<int>
(0,20]179
(20,40]385
(40,60]128
(60,80]22
### USing GGPLOT2
g5<-ggplot(titanic, aes(x = Age)) +
  theme_stata() +
  geom_histogram(binwidth = 5,col="black",fill=rainbow(17))+
  labs(y = "Passenger Count",
       x = "Age (binwidth = 5)",
       title = "Titanic Age Distribtion")

ggplotly(g5)

0204060800306090120
Titanic Age DistribtionAge (binwidth = 5)Passenger Count

#=> By seeing the graph of Distribution of Age, We can conclude that people between range (20,40) were highest compared to other age group.


We can show by Density Graph also

g6<-ggplot(titanic, aes(x = Age)) +
  theme_dark() +
  geom_density(alpha = 0.5) +
  labs(y = "Passenger Count",
       x = "Age (binwidth = 5)",
       title = "Titanic Age Distribtion") 

ggplotly(g6)

0204060800.000.010.020.03
Titanic Age DistribtionAge (binwidth = 5)Passenger Count

Q5) What was the survival rate by class of ticket and gender?

##Using DPLYR
Survival_Count_By_Sex_And_Class<- titanic %>%
  select(Sex,Survived,Pclass) %>%
  group_by(Pclass,Sex,Survived) %>%
  summarise(Survival_Count = n()) 

Survival_Count_By_Sex_And_Class
ABCDEFGHIJ0123456789
Pclass
<fctr>
Sex
<fctr>
Survived
<chr>
Survival_Count
<int>
1femaleNot Survivied3
1femaleSurvived82
1maleNot Survivied61
1maleSurvived40
2femaleNot Survivied6
2femaleSurvived68
2maleNot Survivied84
2maleSurvived15
3femaleNot Survivied55
3femaleSurvived47
### Using GGPLOT2 
g4<-ggplot(titanic, aes(x = Sex, fill = Survived)) + 
  theme_base() +
  facet_wrap(~ Pclass) +
  geom_bar(col="black") +
  labs(y = "Passenger Count",
       title = "Titanic Survival Rates by Pclass and Sex")

ggplotly(g4)

femalemale050100150200250femalemalefemalemale
Not SurviviedSurvived Titanic Survival Rates by Pclass and Sex SexPassenger Count123Survived

#=> By seeing the graph of survival rate by PClass, We can conclude that Survival count of people in PClass 1 are more compared to others because they pay more for PClass 1 ticket.


Q6) What are the survival rates by age?

titanic$Age<- round(titanic$Age)
g7<- ggplot(titanic, aes(x = Age, fill = Survived)) +
  theme_economist_white() +
  geom_histogram(bins = 30,col="black")+
  labs(y = "Passenger Count",
       x = "Age (binwidth = 5)",
       title = "Titanic Survival Rates by Age")

ggplotly(g7)

0204060800204060
Not SurviviedSurvived Titanic Survival Rates by Age Age (binwidth = 5)Passenger CountSurvived

=> By seeing the graph of survival rate by Age, We can conclude that maximum people ranging between (20,40) and maximum people who died are also in the same range.


#Distributions can even be shown using points
g8<-ggplot(age_distribution,aes(distribution,total_count,fill=distribution)) +
  theme_base() +
  geom_count()+
  labs(x = "Age Distribution",
       title = "Distribution by Total Counts")

ggplotly(g8)

(0,20](20,40](40,60](60,80]100200300400
(0,20](20,40](40,60](60,80] Distribution by Total Counts Age Distributiontotal_countdistributionn

We can See Box Plot for more details

g9<-ggplot(titanic, aes(x = Survived, y = Age,fill=Survived)) +
  theme_excel()+
  geom_boxplot() +
  labs(y = "Age",
       x = "Survived",
       title = "Titanic Survival Rates by Age")

ggplotly(g9)

Not SurviviedSurvived020406080
Not SurviviedSurvivedTitanic Survival Rates by AgeSurvivedAgeSurvived

We can see Violin Graph for the summary as well

g10<- ggplot(titanic,aes(titanic$Survived,titanic$Age,fill=titanic$Survived)) +
  theme_excel_new()+
  geom_abline(intercept =median(titanic$Age)) + 
  geom_violin() +
  labs(y = "Age distribution",
       title = "Titanic Survival Rates by Age")

ggplotly(g10)

Not SurviviedSurvived020406080
Not SurviviedSurvivedTitanic Survival Rates by Age

=> By seeing the survival rate by Age, We can conclude by seeing the median line that Non survival people are mostly around 25 and ranges between (20,40). And range of non Survival are (1,74).


Q34)Distribution based on Gender age fare survival

g36<-ggplot(titanic, aes(x =Fare,y=Age)) +
   theme_stata()  +
  facet_wrap(~Sex) +
  geom_point(aes(col=Survived),position = "jitter",pch=8,cex=1.5) +
  geom_smooth(method = "lm",se=F)+
  geom_hline(yintercept = median(titanic$Age))+
  labs(y = "Age",
       x = "Fare",
       title = "Titanic Age Vs Fare wrt Sex and Survived")


ggplotly(g36)

01002003004005000204060800100200300400500
Not SurviviedSurvivedTitanic Age Vs Fare wrt Sex and SurvivedFareAgefemalemaleSurvived

1)=> By seeing the plot we can see there is a horizontal line representing median of Age.

2)=> We can see the distribution of Fare and relation between Fare and Age using a Regression Line.

3)=> By seeing the regression line we can distinguish that males have paid more than females as the regression line is always above the median for males.

4)=> We can also see females are more survived compared to males.


Q7) What is the survival rates by age when segmented by gender and class of ticket?

g11<- ggplot(titanic, aes(x = Age, fill = Survived)) +
  theme_stata() +
  facet_wrap(Sex ~ Pclass) +
  geom_density(alpha = 0.5) +
  labs(y = "Total Count",
       x = "Age",
       title = "Titanic Survival Rates by Age, Pclass and Sex")

ggplotly(g11)

0.000.010.020.030.040204060800.000.010.020.030.04020406080020406080
Not SurviviedSurvivedTitanic Survival Rates by Age, Pclass and SexAgeTotal Countfemale1female2female3male1male2male3Survived

=> By seeing the graph of survival rate by Age,PClass and Sex, We can conclude that Survival count of female is more than male and specially in the range betwen (20,40) more womens survived but more males died between the same range.


Q8) Top 10 people who paid highest ticket price

top_10_ticket_price_payers <- titanic %>%
  select(Name,Fare)%>%
  arrange(desc(Fare)) %>%
  top_n(10)
## Selecting by Fare
top_10_ticket_price_payers$Fare<-round(top_10_ticket_price_payers$Fare)

top_10_ticket_price_payers
ABCDEFGHIJ0123456789
Name
<fctr>
Fare
<dbl>
Ward, Miss. Anna512
Cardeza, Mr. Thomas Drake Martinez512
Lesurer, Mr. Gustave J512
Fortune, Mr. Charles Alexander263
Fortune, Miss. Mabel Helen263
Fortune, Miss. Alice Elizabeth263
Fortune, Mr. Mark263
Ryerson, Miss. Emily Borie262
Ryerson, Miss. Susan Parker "Suzette"262
Baxter, Mr. Quigg Edmond248


Q9)Total people Embarked from each place

g12<-ggplot(titanic,aes(x =Embarked)) +
  theme_foundation() +
  geom_bar(col="black",aes(fill=Embarked))+
  labs(title = "Distribuion of Embarked")

ggplotly(g12)

CherbourgQueenstownvSouthampton0200400
CherbourgQueenstownvSouthamptonDistribuion of EmbarkedEmbarkedcountEmbarked

=> By seeing the graph of Embarked, We can conclude that most of the people boarded titanic from vSouthampton.


Q10) Checking Relationship between Age and Fare

sample_titanic1<- titanic[1:400,]

g13<- ggplot(sample_titanic1,aes(x=Age,y=Fare)) +
  theme_dark()+
  geom_point() +
  geom_smooth(method = lm)+
  labs(title = "Distribution of Age and Fare")

ggplotly(g13)
02040600100200300400500
Distribution of Age and FareAgeFare
# We can also see the corelation coeffcient to see the relation
cor(sample_titanic1$Age,sample_titanic1$Fare)
## [1] 0.09007522


We can see there is no a huge change b/w age and fare. As the corelation coeffcient is .09 so there is no Relation between Fare and Age


Q11)Distribution between Age and Survived

g14<- ggplot(titanic, aes(x = Age, fill = Survived)) +
  theme_economist()+
  geom_histogram(binwidth = 5,col="black") +
  labs(y = "Passenger Count",
       x = "Age (binwidth = 5)",
       title = "Titanic Survival Rates by Age")

ggplotly(g14)

0204060800306090120
Not SurviviedSurvived Titanic Survival Rates by Age Age (binwidth = 5)Passenger CountSurvived

Q12)Distribution of PClass and fare

fare_category <-cut(titanic$Fare,breaks = c(0,100,250,512),labels = c("Silver Price","Golden Price","Premium Price"))

table(fare_category)
## fare_category
##  Silver Price  Golden Price Premium Price 
##           659            39             6
titanic$fare_category<- fare_category

titanic<- na.omit(titanic)

g15<- ggplot(titanic) +
  theme_base()+
  geom_bar(aes(x=fare_category,fill=Pclass))+
  labs(ylab="Fair Distribution",title="Fare Distribution Vs Pclass")

ggplotly(g15)

Silver PriceGolden PricePremium Price0200400600
123 Fare Distribution Vs Pclass fare_categorycountPclass

1)=> By seeing the graph, We can see the Silver Tickets are brought more than others and PClass 3 are more.

2)=> Other important result we see , as Silver Ticket price is between (0,100) still we see PClass 1 and PClass 2 tickets in this price which tells us their is Fare discrimination based on PClass Tickets as for such a small amount we are getting PClass 1 ticket at some Embarked.


Q13)Distribution of Sex and fare

g16<-ggplot(titanic) +
  theme_economist_white()+
  geom_bar(aes(x=fare_category,fill=Sex))+
  labs(ylab="Fair Distribution", title = "Distribution of Fare by Sex")

ggplotly(g16)

Silver PriceGolden PricePremium Price0200400600
femalemale Distribution of Fare by Sex fare_categorycountSex

1)=> By seeing the graph, We can see Males have brought more Silver Tickets than Females.

2)=> Females have brought more Golden Tickets than Males.

3)=> Premium tickets are very less brought.


Q14)Top 10 Aged People in Male

Top_10_Aged_Male<- titanic %>%
  select(Name,Sex,Age) %>%
  filter(Sex=="male") %>%
  arrange(desc(Age)) %>%
  select(Name,Age) %>%
  top_n(10)
## Selecting by Age
Top_10_Aged_Male$Age<- round(Top_10_Aged_Male$Age)

Top_10_Aged_Male
ABCDEFGHIJ0123456789
Name
<fctr>
Age
<dbl>
Barkworth, Mr. Algernon Henry Wilson80
Svensson, Mr. Johan74
Goldschmidt, Mr. George B71
Artagaveytia, Mr. Ramon71
Connors, Mr. Patrick70
Mitchell, Mr. Henry Michael70
Crosby, Capt. Edward Gifford70
Wheadon, Mr. Edward H66
Ostby, Mr. Engelhart Cornelius65
Duane, Mr. Frank65


Details of Oldest Male on Titanic

sqldf("select * from titanic where Name='Barkworth, Mr. Algernon Henry Wilson'")
ABCDEFGHIJ0123456789
PassengerId
<int>
Survived
<chr>
Pclass
<fctr>
Name
<fctr>
Sex
<fctr>
Age
<dbl>
SibSp
<int>
Parch
<int>
Ticket
<fctr>
Fare
<dbl>
631Survived1Barkworth, Mr. Algernon Henry Wilsonmale80002704230


=> We can see from table that Mr Algernon Henry Wilson Barkworth is the oldest male on Ship and he survived.


Q15)Distribution of Aged Males

g17<-ggplot(data = Top_10_Aged_Male,aes(Age)) +
  geom_histogram(fill=rainbow(4),col="black",binwidth = 5) +
  theme_dark() +
  labs(title = "Aged People Age Distribution")

ggplotly(g17)


Q16)Top 10 Aged People in Females

Top_10_Aged_Female<- titanic %>%
  select(Name,Sex,Age) %>%
  filter(Sex=="female") %>%
  arrange(desc(Age)) %>%
  select(Name,Age) %>%
  top_n(10)
## Selecting by Age
Top_10_Aged_Female$Age<- round(Top_10_Aged_Female$Age)

Top_10_Aged_Female
ABCDEFGHIJ0123456789
Name
<fctr>
Age
<dbl>
Andrews, Miss. Kornelia Theodosia63
Turkula, Mrs. (Hedwig)63
Stone, Mrs. George Nelson (Martha Evelyn)62
Warren, Mrs. Frank Manley (Anna Sophia Atkinson)60
Bonnell, Miss. Elizabeth58
Lurette, Miss. Elise58
Graham, Mrs. William Thompson (Edith Junkins)58
Mack, Mrs. (Mary)57
Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)56
Hewlett, Mrs. (Mary D Kingcome)55


sqldf("select * from titanic where Name='Andrews, Miss. Kornelia Theodosia' ")
ABCDEFGHIJ0123456789
PassengerId
<int>
Survived
<chr>
Pclass
<fctr>
Name
<fctr>
Sex
<fctr>
Age
<dbl>
SibSp
<int>
Parch
<int>
Ticket
<fctr>
276Survived1Andrews, Miss. Kornelia Theodosiafemale631013502

=> We can see from table that Miss. Kornelia Theodosia, Andrews is the oldest female on Ship and she survived.


Q17)Relation between survived and fare wrt Pclass

g18<-ggplot(data = titanic,aes(Survived,log(Fare),fill=Pclass)) +
  theme_excel() +
  geom_violin() +
  labs(y="Fare Distribution",title = "Distribution of Fare and Survival")

ggplotly(g18)

Not SurviviedSurvived2345
123Distribution of Fare and SurvivalSurvivedFare DistributionPclass

#=> By seeing the graph of Fare and survival, We can conclude that as the PClass class is increase towards 1, Fare is more expensive and they have more chances of survival.


Q18)Relation between Pclass and Embarked wrt Survival

g19<-ggplot(data = titanic,aes(Pclass,fill=Survived)) +
  theme_dark()+
  facet_wrap(~Embarked) +
  geom_bar(col="black") +
  labs(title = "Distribution of PClass and Embarked wrt Survival")

ggplotly(g19)

1230100200300123123
Not SurviviedSurvivedDistribution of PClass and Embarked wrt SurvivalPclasscountCherbourgQueenstownvSouthamptonSurvived

1)=> By seeing the graph, We can conclude that for all Embarked people in PClass 1 survived more compared to other PClass and Embarked.

2)=> We can also see most of people who died in titanic, boarded their ship from vSouthampton and are from PClass 3.


Q19)Relation between Sex and Embarked wrt Survived

g20<- ggplot(data = titanic,aes(Sex,fill=Survived)) +
  theme_excel()+
  facet_wrap(~Embarked) +
  geom_histogram(stat="count",col="black") +
  labs(title = "Distribution of Sex and Embarked wrt Survived ")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g20)

femalemale0100200300femalemalefemalemale
Not SurviviedSurvivedDistribution of Sex and Embarked wrt SurvivedSexcountCherbourgQueenstownvSouthamptonSurvived

=> By seeing the graph, We can conclude that Females survived from all 3 Embarked and males died more from vSouthampton and we can predict that most died Males may be belonged to Pclass 3 as its survival rate is less.


Q20)Relation between Sex and Embarked wrt Pclass

g21<-ggplot(data = titanic,aes(Sex,fill=Pclass)) +
  theme_fivethirtyeight()+
  facet_wrap(~Embarked) +
  geom_bar(col="black") +
  labs(title = "Distribution of Sex and Embarked wrt Pclass")

ggplotly(g21)
## Warning: plotly.js does not (yet) support horizontal legend items 
## You can track progress here: 
## https://github.com/plotly/plotly.js/issues/53

femalemale0100200300femalemalefemalemale
123 Distribution of Sex and Embarked wrt Pclass CherbourgQueenstownvSouthamptonPclass

1)=> By seeing the graph, We can conclude that for Cherboung has almost same distribution of Pclass among both Sex categories.

2)=> In Queenstown almost all people in both Sex categories boarded ther ticket in PClass 3.

3)=> In vSouthampton males have slight edge for buying Pclass 1 ticket over females. And more males have buyed Pclass 3 ticket. Our prediction was right in the last question.


Q21)Relation between Age and Embarked wrt Pclass

g22<-ggplot(data = titanic,aes(Pclass,Age,fill=Pclass)) +
  theme_dark()+
  facet_wrap(~Embarked) +
  geom_count() +
  ylim(c(0,80))+
  ylab("Age Distribution") +
  labs(title = "Distribution of Age and Embarked wrt Pclass")

ggplotly(g22)

123020406080123123
123Distribution of Age and Embarked wrt PclassPclassAge DistributionCherbourgQueenstownvSouthamptonnPclass

1)=> By seeing the graph, We can see the different age range for buying different Pclass tickets. The thicker areas somewhere means people of same age are more who brought that Ticket.

2)=> We can also see for vSouthampton, for Pclass 1 age category is fully covered from 1 to 80 i.e from a small child to an oldage all types of people are there.

3)=> We can also see for vSouthampton, for Pclass 3 age category the area is very thicker between age 17 to 50 that concludes this range people have buyed more PClass 3 tickets.


Q22)Relation between PClass Age Sex Embarked

g23<-ggplot(data = titanic,aes(x=Age,fill=Pclass)) +
  theme_get()+
  facet_wrap(Sex~Embarked) +
  geom_bar(stat="count") +
  labs(title="Distribution of Age wrt PClass Sex Embarked")

ggplotly(g23)

051015020406080051015020406080020406080
123Distribution of Age wrt PClass Sex EmbarkedAgecountfemaleCherbourgfemaleQueenstownfemalevSouthamptonmaleCherbourgmaleQueenstownmalevSouthamptonPclass

#=> By seeing the graph, We can conclude the same thing which we got from the last graph.


Q23)Relation between PClass Survived Sex Embarked

g24<-ggplot(data = titanic,aes(Pclass,fill=Sex)) +
  theme_igray()+
  facet_wrap(Survived~Embarked) +
  geom_bar(col="black") +
  scale_y_continuous(limits = c(0,80),breaks = seq(0,80,20)) +
  labs(title = "Distribution of Pclass wrt Survived Sex Embarked ")

ggplotly(g24)

020406080123020406080123123
femalemaleDistribution of Pclass wrt Survived Sex EmbarkedPclasscountNot SurviviedCherbourgNot SurviviedQueenstownNot SurviviedvSouthamptonSurvivedCherbourgSurvivedQueenstownSurvivedvSouthamptonSex

=> By seeing the graph, We can see the similar conclution we got earlier that males died more than female of PClass 1 and PClass 2. And in Survival also female survived more than males in all PClass tickets.


Q24)Relation between Fare and Embarked

g25<-ggplot(data = titanic,aes(Fare,Embarked)) +
  theme_bw() +
  geom_jitter(stat = "identity",col="red") +
  geom_smooth(method = lm) +
  labs(title="Relation between Fare and Embarked")

ggplotly(g25)

0100200CherbourgQueenstownvSouthampton
Relation between Fare and EmbarkedFareEmbarked

1)=> By seeing the plot, the line shows the fare range as per differnt Embarked.

2)=> For Cherboung the variation of Fare is pretty high. Distribution of Fare are more between (0,100) so that must be Pclass 3 ticket. There are some more distribution above 100 upto 500 that shows Pclass 2 ticket will be aroung (100,250) and furture is Pclass 3

3)#=> For Queenstown the distribution of Fare are mostly between (0,100).We can conclude people boarded from there are less and brought PClass 3 tickets more compared to other tickets**

4)#=> For vSouthapton we see too much overplotting between (0,100) so that must be Pclass 3 ticket. There are some more distribution above 100 upto 260 approx.

5)#=>It gives an important conclution that PClass ticket prices vary for differnt Embarked as we have seen Pclass 1 ticket for vSouthampton but max price is only 260 here. And there may be discrimination of Fare based on Gender.


Q25)Ratio of Survival of Sex

titanic2 <- read.csv("C:/Users/Hp/Desktop/titanic.csv",sep = ",",stringsAsFactors = TRUE,na.strings = NA,header = T)


#removing the NULL values 
titanic2<- na.omit(titanic2)
dim(titanic2)
## [1] 714  12
#We can use sapply function to get the nº of missing values in our dataset
sapply(titanic2,function(x) sum(is.na(x)))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0
#converting to factors

titanic2$Pclass <- as.factor(titanic2$Pclass)
titanic2$Sex <- as.factor(titanic2$Sex)
titanic2$Embarked <- as.factor(titanic2$Embarked)

titanic2$Embarked<- ifelse(titanic2$Embarked=="C","Cherbourg",(ifelse(titanic2$Embarked=="Q","Queenstown","vSouthampton")))


survival<- titanic2%>%
  select(Sex,Survived)%>%
  group_by(Sex)%>%
  summarise(survival_rate = mean(Survived))

survival
ABCDEFGHIJ0123456789
Sex
<fctr>
survival_rate
<dbl>
female0.7547893
male0.2052980


We can see that Survival ration of female to male is nearly 3:1


Finding Survival Rate of Females to Males

survival<- titanic2%>%
  select(Sex,Survived)%>%
  group_by(Sex)%>%
  summarise(survival_rate = mean(Survived))

survival
ABCDEFGHIJ0123456789
Sex
<fctr>
survival_rate
<dbl>
female0.7547893
male0.2052980

We can see that Survival ration of female to male is nearly 3:1


Q26)Distribution of Sex and Survival

g26<- ggplot(data = titanic,aes(Sex,fill=Survived)) +
  theme_base() +
  geom_bar(col="black") +
  labs(title="Distribution of Sex and Survival")

ggplotly(g26)

femalemale0100200300400
Not SurviviedSurvived Distribution of Sex and Survival SexcountSurvived

The graphs tells that survival rate of female is more than male


Q27)Distribution of Fare vs Survival

g27<-ggplot(data = titanic,aes(Fare,fill=Survived)) +
  theme_economist() +
  geom_histogram(bins = 30,col="black") +
  scale_y_continuous(limits = c(0,120)) +
  labs(title = "Distribution of Fare wrt to Survival")

ggplotly(g27)

01002000255075100125
Not SurviviedSurvived Distribution of Fare wrt to Survival FarecountSurvived

The graph tells that, as fare increases Survival increases


Q28)Distribution of survival specific to sex

g28<-ggplot(data = titanic,aes(Fare,fill=Survived)) +
  theme_stata() +
  facet_wrap(~Sex)+
  geom_histogram(bins = 30,col="black") +
  scale_y_continuous(limits = c(0,90),breaks =seq(0,90,30)) +
  labs(x="Fare",y="Survival Count",title = "Distribution of survival specific to sex")

ggplotly(g28)

010020003060900100200
Not SurviviedSurvivedDistribution of survival specific to sexFareSurvival CountfemalemaleSurvived

As fare increases Survival increases in case of females but not males and expensive fare tickets are more likely brought by men.


Q29)Distribution of survival wrt SibSp

g29<-ggplot(data = titanic,aes(SibSp,fill=Survived)) +  
  theme_solarized_2() +
  geom_bar(col="black")+
  scale_y_continuous(limits = c(0,500),breaks =seq(0,500,100)) +
  labs(title = "Distribution of SibSp wrt Survived")

ggplotly(g29)

0240100200300400500
Not SurviviedSurvivedDistribution of SibSp wrt SurvivedSibSpcountSurvived

The graph tells that, as the siblings increase the chances of survival decrease


Q30)Distribution of survival wrt Parch

g30<-ggplot(data = titanic,aes(Parch,fill=Survived)) +  
  theme_solarized_2() +
  geom_bar(col="black")+
  scale_y_continuous(limits = c(0,200),breaks =seq(0,200,50)) +
  labs(title = "Distribution of Parch wrt Survived")

ggplotly(g30)

0246050100150200
Not SurviviedSurvivedDistribution of Parch wrt SurvivedParchcountSurvived

The graph tells that, as the family increase the chances of survival decrease


Q31)Relationship between family size and survivals

# Adding a calculated field column to dataset
titanic = titanic%>%
  mutate(family_size = SibSp + Parch)

g31<-ggplot(data = titanic,aes(family_size,fill=Survived)) +
  theme_base() + 
  geom_bar(col="black")+
  labs(title = "Distribution of Family Size wrt Survived",y="Survival Count") +
  scale_y_continuous(limits = c(0,450),breaks = seq(0,450,100))

ggplotly(g31)

02460100200300400
Not SurviviedSurvived Distribution of Family Size wrt Survived family_sizeSurvival CountSurvived

The graph tells that, as family size increase survival rate decreases.


Q32)Distribution of cabin locations.

A = titanic%>%
  select(Cabin) %>%
  group_by(Cabin) %>%
  summarise(total = n()) %>%
  arrange(desc(total)) %>%
  top_n(8)
## Selecting by total
# Filling the missing column value with a variable
#class(A)
A=as.matrix(A)
A[1,1] <- "X"
A=as.data.frame(A)
#class(A)

A
ABCDEFGHIJ0123456789
Cabin
<fctr>
total
<fctr>
X523
B96 B984
C23 C25 C274
G64
C22 C263
D3
F23
F333
g32<-ggplot(data = A,aes(A$Cabin,A$total)) +
  theme_dark() +
  geom_histogram(stat = "identity",fill=rainbow(8),col="black") +
  labs(title = "Distribution of Cabin")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g32)

B96 B98C22 C26C23 C25 C27DF2F33G6X 3 4523
Distribution of CabinA$CabinA$total

1)The graph tells that, most of Cabins has the space of 3 to 4 people living.

2)The X cabin represents the missing names of Cabin.


Q32)Top 5 Highest Tickets Sell

top_5_highest_ticket_sell <- sqldf("select Ticket,count(Ticket) as total from titanic2 group by Ticket order by total desc limit 5")
top_5_highest_ticket_sell
ABCDEFGHIJ0123456789
Ticket
<fctr>
total
<int>
3470827
31012956
3470886
CA 21446
3826525
#Ploting Graph
g33<-ggplot(data = top_5_highest_ticket_sell ,aes(Ticket,total)) +
  theme_economist_white() +
  geom_bar(stat="identity",fill=rainbow(5) ,col="black") +
  ylim(c(0,8)) +
  labs(title = "Top 5 ticket Sold")

ggplotly(g33)

3101295347082347088382652CA 214402468
Top 5 ticket Sold Tickettotal

The graph tells that, ticket number 347082 is the highest sold ticket.


Q33)Top 10 Initials Sir Names (Can help to predict survival)

titanic2<-separate(data = titanic2,col =Name,into= c("LastName", "Initial","FirstName","MiddleName"),sep = " ")
## Warning: Expected 4 pieces. Additional pieces discarded in 172 rows [2,
## 4, 8, 9, 14, 15, 17, 23, 30, 32, 33, 34, 36, 39, 40, 66, 77, 93, 95,
## 105, ...].
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 214 rows
## [3, 11, 16, 19, 31, 42, 43, 46, 47, 49, 53, 57, 58, 61, 62, 65, 69, 70, 74,
## 78, ...].
top_10_Initial_Sir_Names<- titanic2%>%
  select(Initial,Survived) %>%
  group_by(Initial) %>%
  summarise(total = n(),survived_total= sum(Survived)) %>%
  arrange(desc(total)) %>%
  top_n(5)
## Selecting by survived_total
top_10_Initial_Sir_Names
ABCDEFGHIJ0123456789
Initial
<chr>
total
<int>
survived_total
<int>
Mr.38566
Miss.143104
Mrs.10483
Master.3621
Dr.63
#Ploting the Graph
g34<-ggplot(data = top_10_Initial_Sir_Names,aes(Initial,total,fill=Initial)) +
  theme_foundation() +
  geom_count()+
  labs(title = "Distribution of Sir Names")

ggplotly(g34)

Dr.Master.Miss.Mr.Mrs.0100200300400
Dr.Master.Miss.Mr.Mrs.Distribution of Sir NamesInitialtotalInitialn

=>1)The graph tells that, Sirname ‘Mr’ are more compared to others.

2)Sirname ‘Dr’ there were very less Doctors that that taveled.

3)Sirname with ‘Miss’ is the 2nd most travelled people that shows there were many unmarried people also.


Q33)Total Vs Survival Count

barplot(top_10_Initial_Sir_Names$total,ylim = c(0,450),col=rainbow(5),legend = top_10_Initial_Sir_Names$Initial)
lines(top_10_Initial_Sir_Names$total,type="o",col="blue") +
lines(top_10_Initial_Sir_Names$survived_total,type="o",col="white") +
title("Total Vs Survival Count")

## integer(0)


The bar and line plot shows the total Sir name count wrt their survival count.


Q35) We can find the Relations

##relation between Survival PClass 
titanic2$Pclass<-as.numeric(titanic2$Pclass)
cor(titanic2$Survived,titanic2$Pclass)
## [1] -0.3596527


The correlation between Survival and Pclass is moderate negative. That shows as PClass increases From 1 to 2 to 3 , Survival rate decreases.


##relation between Survival Fare 
cor(titanic2$Survived,titanic2$Fare)
## [1] 0.2681886


The correlation between Survival and Pclass is moderate positive. That shows as Fare increases, Survival rate also increases.


Q36Number of ‘Child’,‘Adult’,‘Elder’ on the ship

# Creating a range distribuion of Age
age_categories <- cut(titanic$Age,breaks = c(0,20,50,80),labels = c("Children","Adults","OldAged"))

table(age_categories)
## age_categories
## Children   Adults  OldAged 
##      178      461       64
titanic$age_category<- age_categories


titanic<- na.omit(titanic)

nrow(titanic)
## [1] 703
sapply(titanic,function(x) sum(is.na(x)))
##   PassengerId      Survived        Pclass          Name           Sex 
##             0             0             0             0             0 
##           Age         SibSp         Parch        Ticket          Fare 
##             0             0             0             0             0 
##         Cabin      Embarked fare_category   family_size  age_category 
##             0             0             0             0             0


g37<-ggplot(titanic,aes(age_category)) +
  theme_pander() +
  geom_bar(aes(fill=age_category),col="black") +
  scale_y_continuous(limits = c(0,500),breaks = seq(0,500,100)) +
  labs(title = "Distribution of Age Category")

ggplotly(g37)

ChildrenAdultsOldAged0100200300400500
ChildrenAdultsOldAged Distribution of Age Category age_categorycountage_category

=> By seeing the graph of Age category, We can conclude that most of the people are from Adult Category.


Q37Age Categories based on the Embarked wrt to Sex

#Using DPLYR
age_cateory_by_Embarked_Sex<- titanic%>%
  select(age_category,Embarked,Sex) %>%
  group_by(age_category,Embarked,Sex)%>%
  summarise(total = n())
age_cateory_by_Embarked_Sex
ABCDEFGHIJ0123456789
age_category
<fctr>
Embarked
<chr>
Sex
<fctr>
total
<int>
ChildrenCherbourgfemale21
ChildrenCherbourgmale10
ChildrenQueenstownfemale5
ChildrenQueenstownmale5
ChildrenvSouthamptonfemale51
ChildrenvSouthamptonmale86
AdultsCherbourgfemale33
AdultsCherbourgmale47
AdultsQueenstownfemale7
AdultsQueenstownmale8
#Using GGPLOT2
g38<- ggplot(titanic,aes(age_category,fill=Sex)) +
  theme_dark() +
  facet_wrap(~Embarked) +
  geom_histogram(stat="count",col="black") +
  labs(title = "Age Categories based on the Embarked wrt to Sex")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g38)

ChildrenAdultsOldAged0100200300ChildrenAdultsOldAgedChildrenAdultsOldAged
femalemaleAge Categories based on the Embarked wrt to Sexage_categorycountCherbourgQueenstownvSouthamptonSex

=> By seeing the graph, We can conclude that most of the most of male People and specially adults boarded titanic from vSouthampton.


Q38Age Categories based on the Embarked wrt to Pclass

#Using DPLYR
age_cateory_by_Embarked_Pclass<- titanic%>%
  select(age_category,Embarked,Pclass) %>%
  group_by(age_category,Embarked,Pclass)%>%
  summarise(total = n())
age_cateory_by_Embarked_Pclass
ABCDEFGHIJ0123456789
age_category
<fctr>
Embarked
<chr>
Pclass
<fctr>
total
<int>
ChildrenCherbourg17
ChildrenCherbourg24
ChildrenCherbourg320
ChildrenQueenstown310
ChildrenvSouthampton114
ChildrenvSouthampton231
ChildrenvSouthampton392
AdultsCherbourg149
AdultsCherbourg211
AdultsCherbourg320
#Using GGPLOT2
g39<-ggplot(titanic,aes(Pclass)) +
  theme_dark() +
  facet_wrap(~Embarked) +
  geom_histogram(stat="count",aes(fill=age_category),col="black") +
  labs(title = "Age Categories based on the Embarked wrt to Pclass")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g39)

1230100200300123123
ChildrenAdultsOldAgedAge Categories based on the Embarked wrt to PclassPclasscountCherbourgQueenstownvSouthamptonage_category

1)=> By seeing the graph, We can see that mostly Adults traveled the titanic having any of the PClass ticket.

2)=> Most of the childrens boarded titanic from vSouthampton and with PClass 3. 3)=> OldAge people are less and have traveled from all the Embarked and mostly from vSouthampton.


Q38Age Categories based on the Embarked wrt to Sex and Pclass

#Using DPLYR

age_cateory_by_Embarked_Sex_Pclass<- titanic%>%
  select(age_category,Sex,Embarked,Pclass) %>%
  group_by(age_category,Sex,Embarked,Pclass)%>%
  summarise(total = n())
age_cateory_by_Embarked_Sex_Pclass
ABCDEFGHIJ0123456789
age_category
<fctr>
Sex
<fctr>
Embarked
<chr>
Pclass
<fctr>
total
<int>
ChildrenfemaleCherbourg15
ChildrenfemaleCherbourg23
ChildrenfemaleCherbourg313
ChildrenfemaleQueenstown35
ChildrenfemalevSouthampton19
ChildrenfemalevSouthampton213
ChildrenfemalevSouthampton329
ChildrenmaleCherbourg12
ChildrenmaleCherbourg21
ChildrenmaleCherbourg37
#Using GGPLOT
g40<-ggplot(titanic,aes(Pclass,fill=age_category)) +
  theme_dark() +
  facet_wrap(Sex~Embarked) +
  geom_bar(col="black") +
  labs(title = "Age Categories based on the Embarked wrt to Sex and Pclass")

ggplotly(g40)

050100150200123050100150200123123
ChildrenAdultsOldAgedAge Categories based on the Embarked wrt to Sex and PclassPclasscountfemaleCherbourgfemaleQueenstownfemalevSouthamptonmaleCherbourgmaleQueenstownmalevSouthamptonage_category

1)=> By seeing the graph, We can see that in Cherboung and Queenstown the distribution of Age cateogory among males and females are almost wrt to PClass.

2)=> In vSouthampton Adult Males boarded more than Adult Females wrt to all PClass. And this is same for other age cateogories also as males of all category are more than females.


Q39Age Categories based on the Embarked wrt to Survived

#Using DPLYR
age_cateory_by_Embarked_Survived<- titanic%>%
  select(age_category,Embarked,Survived) %>%
  group_by(age_category,Embarked,Survived)%>%
  summarise(total = n())
age_cateory_by_Embarked_Survived
ABCDEFGHIJ0123456789
age_category
<fctr>
Embarked
<chr>
Survived
<chr>
total
<int>
ChildrenCherbourgNot Survivied9
ChildrenCherbourgSurvived22
ChildrenQueenstownNot Survivied6
ChildrenQueenstownSurvived4
ChildrenvSouthamptonNot Survivied82
ChildrenvSouthamptonSurvived55
AdultsCherbourgNot Survivied35
AdultsCherbourgSurvived45
AdultsQueenstownNot Survivied11
AdultsQueenstownSurvived4
#Using GGPLOT
g41<-ggplot(titanic,aes(Survived,fill=age_category)) +
  theme_dark() +
  facet_wrap(~Embarked) +
  geom_histogram(stat="count",col="black") +
  labs(title = "Age Categories based on the Embarked wrt to Survived")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g41)

Not SurviviedSurvived0100200300Not SurviviedSurvivedNot SurviviedSurvived
ChildrenAdultsOldAgedAge Categories based on the Embarked wrt to SurvivedSurvivedcountCherbourgQueenstownvSouthamptonage_category

1)=> By seeing the graph, We can see that in all Embarked Adults are survived most and next are childrens.

2)=> The result is same in case of death as Adults have died more and the next is Children.


Q40) Age Categories based on the Embarked wrt to Sex Survived and Pclass

#Using DPLYR
age_cateory_by_Embarked_Sex_Survived_Pclass<- titanic%>%
  select(age_category,Embarked,Sex,Pclass,Survived) %>%
  group_by(age_category,Embarked,Sex,Pclass,Survived)%>%
  summarise(total = n())
head(age_cateory_by_Embarked_Sex_Survived_Pclass)
ABCDEFGHIJ0123456789
age_category
<fctr>
Embarked
<chr>
Sex
<fctr>
Pclass
<fctr>
Survived
<chr>
total
<int>
ChildrenCherbourgfemale1Survived5
ChildrenCherbourgfemale2Survived3
ChildrenCherbourgfemale3Not Survivied4
ChildrenCherbourgfemale3Survived9
ChildrenCherbourgmale1Not Survivied1
ChildrenCherbourgmale1Survived1
#Using GGPLOT
g42<-ggplot(titanic,aes(Pclass,fill=age_category)) +
  theme_dark() +
  facet_wrap(Sex~Embarked) +
  geom_bar(stat="count",col="black") +
  labs(title = "Age Categories based on the Pclass wrt Embarked,Sex")

ggplotly(g42)

050100150200123050100150200123123
ChildrenAdultsOldAgedAge Categories based on the Pclass wrt Embarked,SexPclasscountfemaleCherbourgfemaleQueenstownfemalevSouthamptonmaleCherbourgmaleQueenstownmalevSouthamptonage_category

=> By seeing the graph, We get similar results which we got before.


Q41)Age Categories based on the Pclass wrt Embarked,Survived

#Using DPLYR
age_cateory_by_Embarked_Survived_Pclass<- titanic%>%
  select(age_category,Embarked,Pclass,Survived) %>%
  group_by(age_category,Embarked,Pclass,Survived)%>%
  summarise(total = n())
head(age_cateory_by_Embarked_Survived_Pclass)
ABCDEFGHIJ0123456789
age_category
<fctr>
Embarked
<chr>
Pclass
<fctr>
Survived
<chr>
total
<int>
ChildrenCherbourg1Not Survivied1
ChildrenCherbourg1Survived6
ChildrenCherbourg2Survived4
ChildrenCherbourg3Not Survivied8
ChildrenCherbourg3Survived12
ChildrenQueenstown3Not Survivied6
#Using GGPLOT
g43<-ggplot(titanic,aes(Pclass,fill=age_category)) +
  theme_dark() +
  facet_wrap(Survived~Embarked) +
  geom_bar(stat="count",col="black") +
  labs(title = "Age Categories based on the Pclass wrt Embarked,Survived")

ggplotly(g43)

050100150200123050100150200123123
ChildrenAdultsOldAgedAge Categories based on the Pclass wrt Embarked,SurvivedPclasscountNot SurviviedCherbourgNot SurviviedQueenstownNot SurviviedvSouthamptonSurvivedCherbourgSurvivedQueenstownSurvivedvSouthamptonage_category

=> By seeing the graph, We can see the similar results as before.


Q42)Age Categories based on the Embarked wrt to Fare

g44<-ggplot(titanic,aes(age_category,Fare)) +
  theme_stata() +
  facet_wrap(~Embarked) +
  geom_violin(color="green",fill="blue")+
  labs(title = "Age Categories based on the Embarked wrt to Fare")

ggplotly(g44)

ChildrenAdultsOldAged0100200ChildrenAdultsOldAgedChildrenAdultsOldAged
Age Categories based on the Embarked wrt to Fareage_categoryFareCherbourgQueenstownvSouthampton

1)=> By seeing the graph, We can see that fares ranges with huge margin for age category wrt differnt Embarked.

2)=> We can conclude that there is discrimination of fares among different Embarked for diffrent age categories specially Adults.


Q43) Age Categories based on the Embarked wrt to Fare and Pclass

g45<-ggplot(titanic,aes(age_category,Fare,fill=Pclass)) +
  theme_economist() +
  facet_wrap(~Embarked) +
  geom_violin() +
  labs(title = "Age Categories based on the Embarked wrt to Fare and Pclass")
  
ggplotly(g45)

ChildrenAdultsOldAged0100200ChildrenAdultsOldAgedChildrenAdultsOldAged
123 Age Categories based on the Embarked wrt to Fare and Pclass age_categoryFareCherbourgQueenstownvSouthamptonPclass

=> By seeing the graph, We can see that fares ranges age category wrt PClass and differnt Embarked.

=>PClass 3 rates are somewhat similar there is not a huge discrimination of fare in this. Other PClass have some discrimiantion wrt Embarked and Age Category.


Q44) Age Categories based on the Survived wrt to Fare and Pclass

g46<-ggplot(titanic,aes(age_category,Fare,fill=Pclass)) +
  theme_economist() +
  facet_wrap(~Survived) +
  geom_violin(col="brown") +
  scale_y_continuous(limit=c(0,300))+
  labs(title = "Age Categories based on the Survived wrt to Fare and Pclass")
  
ggplotly(g46)

ChildrenAdultsOldAged0100200300ChildrenAdultsOldAged
123 Age Categories based on the Survived wrt to Fare and Pclass age_categoryFareNot SurviviedSurvivedPclass

=> By seeing the graph, We can see in Survival we see our previous result that with high PClass i.e 1 the probability of survial increasesa as they pay more compared to others. And it decreases as the PClass goes down as they pay less.